In [1]:
import pandas as pd
pd.options.display.max_columns = 200
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
# increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 18, 6
rcParams['font.size'] = 16
rcParams['axes.labelsize'] = 14
rcParams['xtick.labelsize'] = 13
rcParams['ytick.labelsize'] = 13
Go to https://www.drivendata.org/ and register. Today's homework uses data from exactly this site.
We are interested in the competition https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/ . It features data collected by Taarifa and the Tanzanian Ministry of Water and Irrigation.
The problem statement is as follows: Tanzania is dotted with water pumps that keep the local population from going thirsty. Knowing who installed a pump and when, and how it is being managed, one can try to predict which pumps are functional, which need repair, and which do not work at all.
That is what we will do here, leveling up our hyperparameter-tuning skills along the way.
In [2]:
train_X, train_y = pd.read_csv(  # path to your train.csv file
    'data/WaterTable/train.csv'
), pd.read_csv(  # path to your trainLabels.csv file
    'data/WaterTable/trainLabels.csv'
)
df = pd.merge(train_X, train_y, how='left')
df_test = pd.read_csv(  # path to your test.csv file
    'data/WaterTable/test.csv'
)
In [3]:
df.head()
Out[3]:
Preprocessing is an optional block: you are free to prepare the data entirely in your own way.
The only caveat: if you do decide to prepare the data yourself, replace the odd value "other - mkulima/shinyanga" with plain "other", since the test set contains only "other":
df.loc[df.extraction_type == 'other - mkulima/shinyanga', 'extraction_type'] = 'other'
In [4]:
def reduce_factor_levels(df, column_name, limit=None, top=None, name=None):
    """Keep only the most frequent levels of a categorical column, lumping the rest together."""
    assert limit is not None or top is not None, 'Specify limit or top'
    if top is None:
        # take the `limit` most frequent levels
        top = df[column_name].value_counts()[:limit].index
    if name is None:
        name = '%s_OTHER' % column_name
    # replace every level outside `top` with the catch-all name
    df.loc[~df[column_name].isin(top), column_name] = name
    return top
In [5]:
top = reduce_factor_levels(df, 'funder', 10)
reduce_factor_levels(df_test, 'funder', top=top);
top = reduce_factor_levels(df, 'installer', 10)
reduce_factor_levels(df_test, 'installer', top=top);
In [6]:
#drop = ['wpt_name', 'num_private', 'subvillage', 'region_code', 'district_code', 'lga', 'ward', 'recorded_by', 'scheme_name']
drop = ['wpt_name', 'num_private', 'district_code', 'region_code', 'subvillage']
df.drop(drop, axis=1, inplace=True)
df_test.drop(drop, axis=1, inplace=True)
In [7]:
# normalize missing / 'None' scheme_management values to an empty string
df.loc[df.scheme_management == 'None', 'scheme_management'] = ''
df.loc[df.scheme_management.isnull(), 'scheme_management'] = ''
df_test.loc[df_test.scheme_management.isnull(), 'scheme_management'] = ''
In [8]:
df['construction_date_known'] = (df.construction_year > 0).astype(np.int32)
df_test['construction_date_known'] = (df_test.construction_year > 0).astype(np.int32)
In [9]:
# bucket construction_year into decades, counted from the earliest known decade;
# unknown years (construction_year == 0) go negative and are clamped to bucket 0
min_year = df[df.construction_year > 0].construction_year.min() // 10 - 1
df['construction_decade'] = df.construction_year // 10 - min_year
df_test['construction_decade'] = df_test.construction_year // 10 - min_year
df.loc[df.construction_decade < 0, 'construction_decade'] = 0
df_test.loc[df_test.construction_decade < 0, 'construction_decade'] = 0
In [10]:
top = reduce_factor_levels(df, 'construction_year', 20)
reduce_factor_levels(df_test, 'construction_year', top=top);
In [11]:
df.loc[df.extraction_type == 'other - mkulima/shinyanga', 'extraction_type'] = 'other'
In [12]:
# bin gps_height into 500-metre buckets
heights = np.arange(-1, df.gps_height.max()+500, 500)
height_labels = list(range(len(heights)-1))
df['gps_height_rounded'] = pd.cut(df.gps_height, bins=heights, labels=height_labels)
df_test['gps_height_rounded'] = pd.cut(df_test.gps_height, bins=heights, labels=height_labels)
#df.drop(['gps_height'], axis=1, inplace=True)
#df_test.drop(['gps_height'], axis=1, inplace=True)
In [13]:
#pops = np.arange(-1, df.population.max()+500, 500)
#pops_labels = list(range(len(pops)-1))
#df['pop_rounded'] = pd.cut(df.population, bins=pops, labels=pops_labels)
#df_test['pop_rounded'] = pd.cut(df_test.population, bins=pops, labels=pops_labels)
#df.drop(['population'], axis=1, inplace=True)
#df_test.drop(['population'], axis=1, inplace=True)
In [14]:
#df.drop(['date_recorded'], axis=1, inplace=True)
#df_test.drop(['date_recorded'], axis=1, inplace=True)
In [15]:
df.public_meeting.fillna(True, inplace=True)
df_test.public_meeting.fillna(True, inplace=True)
In [16]:
df.permit.fillna(True, inplace=True)
df_test.permit.fillna(True, inplace=True)
In [17]:
df.gps_height_rounded.fillna(0, inplace=True)
df_test.gps_height_rounded.fillna(0, inplace=True)
In [17]:
df.head()
Out[17]:
In [114]:
df.quality_group.value_counts()
Out[114]:
In [115]:
# distribution of water quality_group per region; unstack() names the columns
# by their actual values, avoiding a fragile manual column renaming
results = df.groupby('region')['quality_group'].value_counts().unstack(fill_value=0)
results['total'] = results.sum(axis=1)
results.sort_values(by='good', ascending=False, inplace=True)
results[['good', 'salty', 'unknown', 'milky', 'colored', 'fluoride']].plot(kind='bar', stacked=True, rot=45);
In [116]:
from folium import Map, CircleMarker
import colorsys
# just the base map
tanzania_map = Map(location=(-2.147466, 34.698766), tiles='Mapbox Bright', zoom_start=6)
#tanzania_map
In [117]:
# use the current pump status as the target
df.status_group.value_counts()
Out[117]:
In [118]:
df['target'] = 0 #non functional
df.loc[df.status_group == 'functional needs repair', 'target'] = 1
df.loc[df.status_group == 'functional', 'target'] = 2
df.head()
Out[118]:
In [119]:
# Map with the water points marked: marker radius and colour
# (red -> green on the HSV hue wheel) encode the target value
def get_radius(target):
    # scale target into a radius between 3 and 10
    return (target - min_) / (max_ - min_) * 7 + 3

def rgbhex(rgb):
    return '#' + ''.join('%02X' % i for i in rgb)

def get_fill_color(target):
    return rgbhex(tuple(int(i * 255) for i in
                        colorsys.hsv_to_rgb(target / max_ * 120.0 / 360.0, 0.56, 0.84)))

def get_border_color(target):
    return rgbhex(tuple(int(i * 255) for i in
                        colorsys.hsv_to_rgb(target / max_ * 120.0 / 360.0, 0.78, 0.36)))

def add_marker(lat, lon, target):
    # draw one circle marker on the shared tanzania_map
    return CircleMarker((lat, lon),
                        radius=get_radius(target),
                        color=get_border_color(target),
                        fill_color=get_fill_color(target),
                        popup='Lat: %.3f; Lon: %.3f' % (lat, lon),
                        ).add_to(tanzania_map)
In [120]:
min_, max_ = df.target.min(), df.target.max()
In [121]:
df.sample(n=1000).apply(lambda row: add_marker(row['latitude'], row['longitude'], row['target']), axis=1);
In [17]:
#tanzania_map
In [123]:
df = df.drop(['target'], axis=1)
df.head()
Out[123]:
In [18]:
X, y, X_test = df.drop(['id', 'status_group'], axis=1), \
df.status_group, \
df_test.drop(['id'], axis=1)
In [19]:
X.head(1)
Out[19]:
The function below is, again, my own contraption; you are free to encode the data your own way.
In [20]:
def prepare(X_train, X_test):
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_extraction import DictVectorizer
    # split the columns into categorical (object dtype) and numeric
    objects = X_train.select_dtypes(include=['O']).columns.values
    numeric = X_train.select_dtypes(exclude=['O']).columns.values
    # one-hot encode the categorical columns
    dv = DictVectorizer(sparse=False)
    data_encoded_tr = dv.fit_transform(X_train[objects].to_dict(orient='records'))
    data_encoded_ts = dv.transform(X_test[objects].to_dict(orient='records'))
    # standardize the numeric columns (fit on train only)
    ss = StandardScaler()
    data_scaled_tr = ss.fit_transform(X_train[numeric])
    data_scaled_ts = ss.transform(X_test[numeric])
    # concatenate back into dense matrices
    train = np.hstack((data_encoded_tr, data_scaled_tr))
    test = np.hstack((data_encoded_ts, data_scaled_ts))
    return train, test
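If you prefer a more declarative equivalent, sklearn's ColumnTransformer does the same split-encode-scale in one object. A minimal sketch, assuming scikit-learn >= 0.20 (where OneHotEncoder accepts string columns); handle_unknown='ignore' guards against categories unseen at fit time:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ct = ColumnTransformer([
    # one-hot encode the object-dtype columns
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False),
     X.select_dtypes(include=['O']).columns.tolist()),
    # standardize the remaining (numeric) columns
    ('num', StandardScaler(),
     X.select_dtypes(exclude=['O']).columns.tolist()),
])
x_train_alt = ct.fit_transform(X)  # fit on train only
x_test_alt = ct.transform(X_test)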
In [21]:
x_train, x_test = prepare(X, X_test)
In [22]:
from sklearn.preprocessing import LabelEncoder
In [23]:
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)
In [24]:
x_train.shape
Out[24]:
In [25]:
x_test.shape
Out[25]:
Take the notebook from today's class and, following its tuning advice, build the best GBM in the world (a small grid-search sketch is given after the imports below)! Don't forget to submit your results to drivendata and brag about them in the chat.
In [254]:
SEED = 1234
np.random.seed(SEED)
In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
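Before settling on hand-picked values, it can pay to let a small grid search explore the main knobs first. A minimal sketch; the grid below is an illustrative assumption, not a tuned result, and can be slow on the full data:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 300],
    'learning_rate': [0.03, 0.1],
    'max_depth': [4, 6],
    'subsample': [0.8, 1.0],
}
gs = GridSearchCV(GradientBoostingClassifier(random_state=SEED),
                  param_grid, cv=3, n_jobs=-1)
gs.fit(x_train, y)
print(gs.best_params_, gs.best_score_)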
In [258]:
clf = GradientBoostingClassifier(random_state=SEED, n_estimators=10, learning_rate=0.01, subsample=0.8, max_depth=4)
scores = cross_val_score(clf, x_train, y)
np.mean(scores), 2*np.std(scores)
Out[258]:
In [259]:
clf = clf.fit(x_train, y)
print('Mean score:', scores.mean())
In [260]:
y_te = clf.predict(x_test)
y_te
Out[260]:
In [261]:
ans_nn2 = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
ans_nn2.head()
Out[261]:
In [262]:
ans_nn2.to_csv('ans_gbm.csv', index=False)
Pick any third-party framework you like:
Install it, tune it up, and beat sklearn's GBM.
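One way to tune it up is a randomized search over LightGBM's main knobs instead of iterating by hand as below. A minimal sketch; the distributions are illustrative assumptions, not tuned values:
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st

rs = RandomizedSearchCV(
    LGBMClassifier(),
    param_distributions={
        'num_leaves': st.randint(31, 200),
        'learning_rate': st.uniform(0.05, 0.15),
        'n_estimators': st.randint(100, 300),
        'max_bin': st.randint(255, 500),
    },
    n_iter=20, cv=3, n_jobs=-1, random_state=SEED,
)
rs.fit(x_train, y)
print(rs.best_params_, rs.best_score_)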
In [26]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
In [64]:
#clf2 = LGBMClassifier(max_bin=475, learning_rate=0.13, n_estimators=140, num_leaves=131)
#clf2 = LGBMClassifier(max_bin=400, learning_rate=0.13, n_estimators=140, num_leaves=131)
clf2 = LGBMClassifier(max_bin=400, learning_rate=0.134, n_estimators=151, num_leaves=131)
scores = cross_val_score(clf2, x_train, y)
np.mean(scores), 2*np.std(scores)
Out[64]:
In [65]:
clf2 = clf2.fit(x_train, y)
In [66]:
y_te = clf2.predict(x_test)
y_te
Out[66]:
In [67]:
ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
ans_nn.head()
Out[67]:
In [68]:
ans_nn.to_csv('ans_lightgbm.csv', index=False)
In [265]:
from catboost import Pool, CatBoostClassifier
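Pool is imported above but never used: CatBoost can also consume the raw, un-encoded categorical columns directly, which often works better than one-hot encoding. A minimal sketch, assuming we train on X from before encoding and that the object-dtype columns are the only categoricals (NaN handling and the categorical gps_height_rounded column may need extra care):
cat_idx = [i for i, c in enumerate(X.columns) if X[c].dtype == object]
pool = Pool(X, label=y, cat_features=cat_idx)
clf_cat = CatBoostClassifier(random_seed=SEED, iterations=500,
                             learning_rate=0.03, depth=6)
clf_cat.fit(pool)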
In [266]:
clf3 = CatBoostClassifier(random_seed=SEED, iterations=500, learning_rate=0.03, depth=6)
scores = cross_val_score(clf3, x_train, y, n_jobs=-1)
np.mean(scores), 2*np.std(scores)
Out[266]:
In [267]:
clf3 = clf3.fit(x_train, y)
In [286]:
y_te = clf3.predict(x_test)
y_te
Out[286]:
In [290]:
# CatBoost returns predictions as a 2-D array; flatten and cast back to integer labels
arr = y_te.ravel().astype(int)
In [291]:
ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(arr)})
ans_nn.head()
Out[291]:
In [292]:
ans_nn.to_csv('ans_catboost.csv', index=False)
In [27]:
import h2o
In [31]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from sklearn.model_selection import cross_val_score
In [29]:
h2o.init()
In [ ]:
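The notebook ends before an H2O model is actually trained. A minimal sketch of how the imported H2OGradientBoostingEstimator could be applied to the already-encoded matrices; the parameter values and the column handling are assumptions:
# stack the target onto the features and mark it as a factor (classification)
train_hf = h2o.H2OFrame(np.column_stack((x_train, y)))
target = train_hf.columns[-1]
train_hf[target] = train_hf[target].asfactor()
features = train_hf.columns[:-1]

h2o_gbm = H2OGradientBoostingEstimator(ntrees=300, learn_rate=0.1,
                                       max_depth=6, seed=SEED)
h2o_gbm.train(x=features, y=target, training_frame=train_hf)

# predicted classes for the test matrix
preds = h2o_gbm.predict(h2o.H2OFrame(x_test))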